The “Individual household electric power consumption” Data Set

The dataset contains data from three power sub-meters installed in a house near central France, each measures specific areas of the house:

Initial Data visualization

The original data set comes with Date and Time columns (both as string), lets convert them into a single field called DateTime:

# Merge datasets
power_df <-
  bind_rows(power_2007, power_2008, power_2009)

# Create DateTime Column
## Combine Date and Time attribute values in a new attribute column
power_df <-cbind(power_df,paste(power_df$Date,power_df$Time), stringsAsFactors=FALSE)

## Give the new attribute in the 6th column a header name 
## NOTE: if you downloaded more than 5 attributes you will need to change the column number)
# str(power_df)
colnames(power_df)[11] <-"DateTime"
# Move the DateTime column to be the first column (for convenience)
power_df <- power_df[,c(ncol(power_df), 1:(ncol(power_df)-1))]
power_df

The DateTime column is still a String, in order to use it’s time components needs to be converted into a POSIXct date type.

## Convert DateTime from character to POSIXct 
power_df$DateTime <- as.POSIXct(power_df$DateTime, tz="Europe/Paris", format="%Y-%m-%d %H:%M:%S")
## Add the time zone
attr(power_df$DateTime, "tzone") <- "Europe/Paris"
power_df

Now, it’s possible to decompose the DateTime field into separate components for further analysis later on.

## Create "year" attribute with lubridate
power_df$year <- year(power_df$DateTime)
power_df$quarter <- quarter(power_df$DateTime)
power_df$month <- month(power_df$DateTime)
power_df$week <- week(power_df$DateTime)
power_df$weekday <- wday(power_df$DateTime, week_start=1)
power_df$day <- day(power_df$DateTime)
power_df$hour <- hour(power_df$DateTime)
power_df$minute <- minute(power_df$DateTime)

power_df

Let’s try to visualize a single week:

## Subset the second week of 2008 - All Observations
houseWeek <- filter(power_df, year == 2008 & week == 2)
## Plot subset houseWeek
plot(houseWeek$Sub_metering_1)

There are too many data points displayed, therefore is difficult to infer anything from it, let’s zoom in into a single day:

# Visualize a Single Day with Plotly
## Subset the 9th day of January 2008 - All observations
houseDay <- filter(power_df, year == 2008 & month == 1 & day == 9)
## Plot sub-meter 1
plot_ly(houseDay, x = ~houseDay$Time, y = ~houseDay$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 layout(title = "Power Consumption January 9Th, 2008",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

The visualization is better, so let’s add the three sub-meters to the graph:

## Plot sub-meter 1, 2 and 3 with title, legend and labels - All observations 
plot_ly(houseDay, x = ~houseDay$DateTime, y = ~houseDay$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 add_trace(y = ~houseDay$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
 add_trace(y = ~houseDay$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
 layout(title = "Power Consumption January 9Th, 2008",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

We can reduce the granularity from one observation per minute to one observation every 10 minutes to get a better plot, this will be called a Day visualization:

## Subset the 9th day of January 2008 - 10 Minute frequency
houseDay10 <- filter(power_df, year == 2008 & month == 1 & day == 9 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))

## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseDay10, x = ~houseDay10$DateTime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
 add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
 layout(title = "Power Consumption January 9Th, 2008",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

Now let’s look at another time period: the 3rd week of April 2009 - 10 Minute frequency

## Subset the 3rd week of April 2009 - 10 Minute frequency
houseWeek16 <- power_df %>%
  filter(year == 2009 & month == 4 & week == 16 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))

## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
 add_trace(y = ~houseWeek16$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
 layout(title = "Power Consumption April 16Th-22Nd, 2009",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

Too many samples, let’s reduce them to one per hour:

## Subset the 3rd week of April 2009 - 60 Minute frequency
houseWeek16 <- power_df %>%
  filter(year == 2009 & month == 4 & week == 16 & (minute == 0))

## Plot sub-meter 1, 2 and 3 with title, legend and labels - 60 Minute frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
 add_trace(y = ~houseWeek16$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
 layout(title = "Power Consumption April 16Th-22Nd, 2009",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

Leason: Depending on the minute we choose, we will see or not some important data points, like the high peaks consumption instants (over 70 watt/hour) shown in the 10 min frequency graph, that are gone in the 60 minutes graph above.

Moving on, let’s see a month worth of data, with measurements every 6 hours, choosing an arbitrary minute.

## Subset April 2009 - 6 hour frequency
houseWeek16 <- power_df %>%
  filter(year == 2009 & month == 4 & (hour == 0 | hour == 6 | hour == 12 | hour == 18) & (minute == 23))

## Plot sub-meter 1, 2 and 3 with title, legend and labels - 3 hour frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
 add_trace(y = ~houseWeek16$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
 layout(title = "Power Consumption for April, 2009",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

The Water Heater & AC seem to introduce a lot of noise, let’s see only the kitchen and Laundry Room

This way we can tell how many types a week the is someone cooking and/or doing laundry in a month:

## Subset the 3rd-4th weeks of April 2009 - 6 hour frequency
houseWeek16 <- power_df %>%
  filter(year == 2009 & month == 4 & (hour == 0 | hour == 6 | hour == 12 | hour == 18) & (minute == 23))

## Plot sub-meter 1, 2 and 3 with title, legend and labels - 3 hour frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
 add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
 layout(title = "Power Consumption April, 2009, samples every 6 hours",
 xaxis = list(title = "Time"),
 yaxis = list (title = "Power (watt-hours)"))

Data preparation

Let’s take a sample of one observation per week: on Mondays at 8:00pm for 2007, 2008 and 2009, and plot a time series for each sub-meter.

## Subset to one observation per week on Mondays at 8:00pm for 2007, 2008 and 2009
house070809weekly <- filter(power_df, weekday == 2 & hour == 20 & minute == 1)

Plot of Sub-meter 3 (Water Heater & AC) time series

## Create TS object with SubMeter3
tsSM3_070809weekly <- ts(house070809weekly$Sub_metering_3, frequency=52, start=c(2007,1))

## Plot sub-meter 3 with autoplot (you may need to install these packages)
library(ggplot2)
library(ggfortify)

## Plot sub-meter 3 with autoplot - add labels, color
autoplot(tsSM3_070809weekly, colour = 'green', xlab = "Time", ylab = "Watt Hours", main = "Water Heater & AC", lwd=1)

Plot of Sub-meter 1 (Kitchen) time series

## Create TS object with SubMeter1
tsSM1_070809weekly <- ts(house070809weekly$Sub_metering_1, frequency=52, start=c(2007,1))

## Plot sub-meter 1 with autoplot - add labels, color
autoplot(tsSM1_070809weekly, colour = 'blue', xlab = "Time", ylab = "Watt Hours", main = "Kitchen", lwd=1)

Plot of Sub-meter 2 (Laundry Room) time series

## Create TS object with SubMeter1
tsSM2_070809weekly <- ts(house070809weekly$Sub_metering_2, frequency=52, start=c(2007,1))

## Plot sub-meter 1 with autoplot - add labels, color
autoplot(tsSM2_070809weekly, colour = 'orange', xlab = "Time", ylab = "Watt Hours", main = "Laundry Room", lwd=1)

Forecasting TS

Now with the available data, we’ll try to forecast the power consumption for each sub-meter, using a linear regression model first.

Linear model for sub-meter 3:

## Apply time series linear regression to the sub-meter 3 ts object and use summary to obtain R2 and RMSE from the model you built
library(forecast)
fitSM3 <- tslm(tsSM3_070809weekly ~ trend + season) 
summary(fitSM3)
## 
## Call:
## tslm(formula = tsSM3_070809weekly ~ trend + season)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.9673  -5.6994  -0.0327   4.0327  13.3006 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  13.200306   4.232597   3.119  0.00235 **
## trend         0.000629   0.015100   0.042  0.96685   
## season2      -6.900941   6.214791  -1.110  0.26938   
## season3     -13.234903   6.213892  -2.130  0.03554 * 
## season4     -13.235532   6.213030  -2.130  0.03551 * 
## season5      -7.236161   6.212204  -1.165  0.24675   
## season6     -13.236790   6.211415  -2.131  0.03544 * 
## season7     -13.237419   6.210663  -2.131  0.03541 * 
## season8      -7.238048   6.209947  -1.166  0.24646   
## season9      -7.238677   6.209267  -1.166  0.24637   
## season10     -1.905973   6.208625  -0.307  0.75947   
## season11     -7.573269   6.208019  -1.220  0.22526   
## season12    -13.240564   6.207450  -2.133  0.03528 * 
## season13     -7.241193   6.206917  -1.167  0.24603   
## season14    -13.241822   6.206421  -2.134  0.03523 * 
## season15     -6.909118   6.205962  -1.113  0.26814   
## season16    -12.576414   6.205539  -2.027  0.04526 * 
## season17     -6.910376   6.205153  -1.114  0.26800   
## season18     -7.244339   6.204804  -1.168  0.24566   
## season19     -3.911634   6.204492  -0.630  0.52978   
## season20     -6.912263   6.204216  -1.114  0.26779   
## season21     -6.912892   6.203977  -1.114  0.26773   
## season22    -12.913521   6.203775  -2.082  0.03984 * 
## season23     -1.580817   6.203610  -0.255  0.79936   
## season24    -13.248113   6.203481  -2.136  0.03506 * 
## season25     -1.248742   6.203390  -0.201  0.84086   
## season26     -1.249371   6.203334  -0.201  0.84078   
## season27     -7.250000   6.203316  -1.169  0.24518   
## season28    -12.917296   6.203334  -2.082  0.03977 * 
## season29    -12.917925   6.203390  -2.082  0.03976 * 
## season30     -7.251887   6.203481  -1.169  0.24507   
## season31     -6.919183   6.203610  -1.115  0.26727   
## season32    -12.586479   6.203775  -2.029  0.04503 * 
## season33    -12.587108   6.203977  -2.029  0.04503 * 
## season34    -12.921070   6.204216  -2.083  0.03974 * 
## season35     -6.588366   6.204492  -1.062  0.29075   
## season36     -1.255661   6.204804  -0.202  0.84002   
## season37    -12.922957   6.205153  -2.083  0.03974 * 
## season38      1.743081   6.205539   0.281  0.77935   
## season39    -12.590882   6.205962  -2.029  0.04503 * 
## season40     -6.924844   6.206421  -1.116  0.26710   
## season41     -7.258807   6.206917  -1.169  0.24489   
## season42     -1.259436   6.207450  -0.203  0.83962   
## season43     -6.926731   6.208019  -1.116  0.26709   
## season44     -7.594027   6.208625  -1.223  0.22404   
## season45      4.072011   6.209267   0.656  0.51340   
## season46     -6.928618   6.209947  -1.116  0.26711   
## season47     -6.929247   6.210663  -1.116  0.26712   
## season48      4.403457   6.211415   0.709  0.47995   
## season49    -12.930506   6.212204  -2.081  0.03985 * 
## season50      0.735532   6.213030   0.118  0.90599   
## season51     -6.598430   6.213892  -1.062  0.29075   
## season52     -6.932393   6.214791  -1.115  0.26722   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.122 on 104 degrees of freedom
## Multiple R-squared:  0.3544, Adjusted R-squared:  0.03167 
## F-statistic: 1.098 on 52 and 104 DF,  p-value: 0.3383

Now lets create and plot a forecast for sub-meter 3.

## Create the forecast for sub-meter 3. Forecast ahead 20 time periods 
forecastfitSM3 <- forecast(fitSM3, h=20)
## Plot the forecast for sub-meter 3. 
autoplot(forecastfitSM3, xlab = "Year", ylab = "Watt Hours", main = "Water Heater & AC forecast")

The graph shows a few things to note: - The levels of confidence: 80% and 95% (showing the range at which there is an 85% or 95% probability that the forecasted value will fall into). - The levels of confidence shadow show negative values. This is because the confidence level shows a range of acceptance which is equidistant to an edge value greater than, and another one lower than 0. - Note that no forecasted values are below 0.

To correct this situation (of negative values), we can chance the limits of the plot, also we will update the confidence levels:

## Create sub-meter 3 forecast with confidence levels 80 and 90
forecastfitSM3c <- forecast(fitSM3, h=20, level=c(80,90))

## Plot sub-meter 3 forecast, limit y and add labels
plot(forecastfitSM3c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Water Heater & AC Forecast")

Now let’s forecast the other two sub-meters.

Linear model for sub-meter 2:

## Apply time series linear regression to the sub-meter 2 ts object and use summary to obtain R2 and RMSE from the model you built
fitSM2 <- tslm(tsSM2_070809weekly ~ trend + season) 
summary(fitSM2)
## 
## Call:
## tslm(formula = tsSM2_070809weekly ~ trend + season)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6667  -0.6246  -0.0421   0.0421  27.2913 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.1861071  3.8577383   0.048   0.9616  
## trend        0.0008088  0.0137625   0.059   0.9533  
## season2      0.4368859  5.6643798   0.077   0.9387  
## season3     -0.2305895  5.6635605  -0.041   0.9676  
## season4      0.4352684  5.6627746   0.077   0.9389  
## season5      0.4344596  5.6620220   0.077   0.9390  
## season6     11.7669842  5.6613027   2.078   0.0401 *
## season7      0.0995087  5.6606168   0.018   0.9860  
## season8     -0.2346334  5.6599643  -0.041   0.9670  
## season9     -0.2354421  5.6593452  -0.042   0.9669  
## season10     0.4304158  5.6587595   0.076   0.9395  
## season11    12.0962737  5.6582072   2.138   0.0349 *
## season12     0.0954649  5.6576883   0.017   0.9866  
## season13    -0.2386772  5.6572028  -0.042   0.9664  
## season14     0.7605140  5.6567508   0.134   0.8933  
## season15     0.0930386  5.6563323   0.016   0.9869  
## season16     0.0922298  5.6559472   0.016   0.9870  
## season17     0.4247544  5.6555955   0.075   0.9403  
## season18     0.0906123  5.6552774   0.016   0.9872  
## season19     1.4231368  5.6549927   0.252   0.8018  
## season20     0.0889947  5.6547415   0.016   0.9875  
## season21     0.4215193  5.6545238   0.075   0.9407  
## season22    -0.2459561  5.6543395  -0.043   0.9654  
## season23     0.7532351  5.6541888   0.133   0.8943  
## season24    -0.2475737  5.6540715  -0.044   0.9652  
## season25    12.4182842  5.6539878   2.196   0.0303 *
## season26    -0.2491912  5.6539376  -0.044   0.9649  
## season27    -0.2500000  5.6539208  -0.044   0.9648  
## season28     0.0825246  5.6539376   0.015   0.9884  
## season29     0.4150491  5.6539878   0.073   0.9416  
## season30    -0.2524263  5.6540715  -0.045   0.9645  
## season31     0.0800983  5.6541888   0.014   0.9887  
## season32     0.7459561  5.6543395   0.132   0.8953  
## season33    -0.2548526  5.6545238  -0.045   0.9641  
## season34    -0.2556614  5.6547415  -0.045   0.9640  
## season35     0.4101965  5.6549927   0.073   0.9423  
## season36     0.4093877  5.6552774   0.072   0.9424  
## season37     0.4085790  5.6555955   0.072   0.9425  
## season38    -0.2588965  5.6559472  -0.046   0.9636  
## season39     0.7402948  5.6563323   0.131   0.8961  
## season40     0.0728193  5.6567508   0.013   0.9898  
## season41     0.4053439  5.6572028   0.072   0.9430  
## season42     0.4045351  5.6576883   0.072   0.9431  
## season43    -0.2629403  5.6582072  -0.046   0.9630  
## season44     0.0695842  5.6587595   0.012   0.9902  
## season45    12.4021088  5.6593452   2.191   0.0307 *
## season46    -0.2653666  5.6599643  -0.047   0.9627  
## season47     1.0671579  5.6606168   0.189   0.8508  
## season48    13.3996825  5.6613027   2.367   0.0198 *
## season49     0.0655404  5.6620220   0.012   0.9908  
## season50    12.0647316  5.6627746   2.131   0.0355 *
## season51    -0.2694105  5.6635605  -0.048   0.9622  
## season52     6.0631141  5.6643798   1.070   0.2869  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.403 on 104 degrees of freedom
## Multiple R-squared:  0.3006, Adjusted R-squared:  -0.04917 
## F-statistic: 0.8594 on 52 and 104 DF,  p-value: 0.7245

Forecast plot for sub-meter 2:

## Create sub-meter 2 forecast with confidence levels 80 and 90
forecastfitSM2c <- forecast(fitSM2, h=20, level=c(80,90))

## Plot sub-meter 2 forecast, limit y and add labels
plot(forecastfitSM2c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Laundry Room Forecast")

Linear model for sub-meter 1:

## Apply time series linear regression to the sub-meter 2 ts object and use summary to obtain R2 and RMSE from the model you built
fitSM1 <- tslm(tsSM1_070809weekly ~ trend + season) 
summary(fitSM1)
## 
## Call:
## tslm(formula = tsSM1_070809weekly ~ trend + season)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.4268  -0.2399   0.0000   0.0935  26.0935 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.141984   4.986001  -0.028   0.9773   
## trend        0.001797   0.017788   0.101   0.9197   
## season2     13.044932   7.321026   1.782   0.0777 . 
## season3      0.043134   7.319967   0.006   0.9953   
## season4     12.708004   7.318951   1.736   0.0855 . 
## season5      0.372873   7.317978   0.051   0.9595   
## season6      0.037743   7.317049   0.005   0.9959   
## season7      0.035945   7.316162   0.005   0.9961   
## season8      0.034148   7.315319   0.005   0.9963   
## season9      0.032351   7.314519   0.004   0.9965   
## season10     0.030554   7.313762   0.004   0.9967   
## season11     0.695423   7.313048   0.095   0.9244   
## season12     0.360292   7.312377   0.049   0.9608   
## season13     0.358495   7.311750   0.049   0.9610   
## season14     7.690031   7.311166   1.052   0.2953   
## season15    12.688234   7.310625   1.736   0.0856 . 
## season16     0.019770   7.310127   0.003   0.9978   
## season17     0.017973   7.309673   0.002   0.9980   
## season18    13.016175   7.309261   1.781   0.0779 . 
## season19     0.014378   7.308893   0.002   0.9984   
## season20     0.012581   7.308569   0.002   0.9986   
## season21     9.677450   7.308287   1.324   0.1883   
## season22     0.008986   7.308049   0.001   0.9990   
## season23    24.340522   7.307854   3.331   0.0012 **
## season24    13.005392   7.307703   1.780   0.0780 . 
## season25     0.003595   7.307595   0.000   0.9996   
## season26     0.001797   7.307530   0.000   0.9998   
## season27    12.333333   7.307508   1.688   0.0945 . 
## season28    -0.001797   7.307530   0.000   0.9998   
## season29    -0.003595   7.307595   0.000   0.9996   
## season30    -0.005392   7.307703  -0.001   0.9994   
## season31     0.326144   7.307854   0.045   0.9645   
## season32    -0.008986   7.308049  -0.001   0.9990   
## season33    -0.010784   7.308287  -0.001   0.9988   
## season34    -0.012581   7.308569  -0.002   0.9986   
## season35     0.652289   7.308893   0.089   0.9291   
## season36    -0.016175   7.309261  -0.002   0.9982   
## season37    -0.017973   7.309673  -0.002   0.9980   
## season38     0.646897   7.310127   0.088   0.9297   
## season39    -0.021567   7.310625  -0.003   0.9977   
## season40     0.309969   7.311166   0.042   0.9663   
## season41    -0.025162   7.311750  -0.003   0.9973   
## season42    -0.026959   7.312377  -0.004   0.9971   
## season43    -0.028756   7.313048  -0.004   0.9969   
## season44    -0.030554   7.313762  -0.004   0.9967   
## season45    11.967649   7.314519   1.636   0.1048   
## season46    12.632519   7.315319   1.727   0.0872 . 
## season47    -0.035945   7.316162  -0.005   0.9961   
## season48     0.295591   7.317049   0.040   0.9679   
## season49    -0.039540   7.317978  -0.005   0.9957   
## season50     0.291996   7.318951   0.040   0.9683   
## season51    -0.043134   7.319967  -0.006   0.9953   
## season52    -0.044932   7.321026  -0.006   0.9951   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.568 on 104 degrees of freedom
## Multiple R-squared:  0.3375, Adjusted R-squared:  0.006197 
## F-statistic: 1.019 on 52 and 104 DF,  p-value: 0.4587

Forecast plot for sub-meter 1:

## Create sub-meter 1 forecast with confidence levels 80 and 90
forecastfitSM1c <- forecast(fitSM1, h=20, level=c(80,90))

## Plot sub-meter 1 forecast, limit y and add labels
plot(forecastfitSM1c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Kitchen Forecast")

plotForecastErrors <- function(forecasterrors)
{
  # make a histogram of the forecast errors:
  mybinsize <- IQR(forecasterrors)/4
  mysd <- sd(forecasterrors)
  mymin <- min(forecasterrors) - mysd*5
  mymax <- max(forecasterrors) + mysd*3
  # generate normally distributed data with mean 0 and standard deviation mysd
  mynorm <- rnorm(10000, mean=0, sd=mysd)
  mymin2 <- min(mynorm)
  mymax2 <- max(mynorm)
  if (mymin2 < mymin) { mymin <- mymin2 }
  if (mymax2 > mymax) { mymax <- mymax2 }
  # make a red histogram of the forecast errors, with the normally distributed data overlaid:
  mybins <- seq(mymin, mymax, mybinsize)
  hist(forecasterrors, col="red", freq=FALSE, breaks=mybins)
  # freq=FALSE ensures the area under the histogram = 1
  # generate normally distributed data with mean 0 and standard deviation mysd
  myhist <- hist(mynorm, plot=FALSE, breaks=mybins)
  # plot the normal curve as a blue line on top of the histogram of forecast errors:
  print(points(myhist$mids, myhist$density, type="l", col="blue", lwd=2))
}

Performance review

All three models perform similarly.

Linear models \(RSME\) and \(R^2\) (residuals) comparison:

rsme_list <- c(9.568, 7.403, 8.122)
rsquared_list <- c(0.3375, 0.3006, 0.3544)
model_comparison <- data.frame(rsme_list, rsquared_list)
colnames(model_comparison) <- c("RSME", "RSquared")
rownames(model_comparison) <- c("Kitchen Prediction", "Laundry Room Forecast Prediction", "Water Heater & AC Prediction")
model_comparison

Also it seems model’s forecast errors do not follow a normal distribution, with center in 0, which indicates the models could be improved further more.

Forecast error residuals plot for sub-meter 1

plotForecastErrors(forecastfitSM1c$residuals) # make a histogram

Forecast error residuals plot for sub-meter 2

plotForecastErrors(forecastfitSM2c$residuals) # make a histogram

Forecast error residuals plot for sub-meter 3

plotForecastErrors(forecastfitSM3c$residuals) # make a histogram

Seasonal adjustment

Time series decomposition of sub-meter 1

## Decompose Sub-meter 1 into trend, seasonal and remainder
components070809SM1weekly <- decompose(tsSM1_070809weekly)
## Plot decomposed sub-meter 1 
autoplot(components070809SM1weekly, main="Kitchen decomposition of additive time series")

## Check summary statistics for decomposed sub-meter 1
print("Seasonal")
## [1] "Seasonal"
summary(components070809SM1weekly$seasonal)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.59190 -2.10632 -1.84190 -0.01167 -1.71209 17.62925
print("Trend")
## [1] "Trend"
summary(components070809SM1weekly$trend)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.365   1.808   2.558   2.342   2.904   3.538      52
print("Random")
## [1] "Random"
summary(components070809SM1weekly$random)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -20.5331  -0.8937  -0.3312  -0.3120   0.2361  24.2650       52

Time series decomposition of sub-meter 2

## Decompose Sub-meter 2 into trend, seasonal and remainder
components070809SM2weekly <- decompose(tsSM2_070809weekly)
## Plot decomposed sub-meter 2 
autoplot(components070809SM2weekly, main="Laundry Room decomposition of additive time series")

## Check summary statistics for decomposed sub-meter 2
print("Seasonal")
## [1] "Seasonal"
summary(components070809SM2weekly$seasonal)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2.102163 -2.025240 -1.525240 -0.009684 -1.015625 16.902644
print("Trend")
## [1] "Trend"
summary(components070809SM2weekly$trend)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.692   1.817   1.789   2.019   2.933      52
print("Random")
## [1] "Random"
summary(components070809SM2weekly$random)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -18.5950  -0.3353   0.1166   0.1166   0.5685  18.8281       52

Time series decomposition of sub-meter 3

## Decompose Sub-meter 3 into trend, seasonal and remainder
components070809SM3weekly <- decompose(tsSM3_070809weekly)
## Plot decomposed sub-meter 3 
autoplot(components070809SM3weekly, main = "Water Heater & AC Forecast decomposition of additive time series")

## Check summary statistics for decomposed sub-meter 3 
print("Seasonal")
## [1] "Seasonal"
summary(components070809SM3weekly$seasonal)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -6.37999 -6.01942  2.37481  0.07089  3.12481 11.97578
print("Trend")
## [1] "Trend"
summary(components070809SM3weekly$trend)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   5.288   5.769   6.106   6.046   6.308   6.981      52
print("Random")
## [1] "Random"
summary(components070809SM3weekly$random)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -13.0191  -0.4998   0.1252   0.1348   0.7454  13.2887       52

Holt-Winters Forecasting

Sub-meter 3 Holt-Winters Forecasting:

## Seasonal adjusting sub-meter 3 by subtracting the seasonal component & plot
tsSM3_070809Adjusted <- tsSM3_070809weekly - components070809SM3weekly$seasonal
autoplot(tsSM3_070809Adjusted, main = "Water Heater & AC Forecast Holt-Winters Forecasting")

## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
autoplot(decompose(tsSM3_070809Adjusted), main = "Adjusted Water Heater & AC decomposition of additive time series")

“Yes there is a seasonal line, but look at the scale for the seasonal section. -1e-15 through 5e-16. That’s a decimal with 15 zeros before 1. A very very small number indeed. For all practical purposes the seasonality has been removed.”

HoltWinters Simple Exponential Smoothing

## Holt Winters Exponential Smoothing & Plot
tsSM3_HW070809 <- HoltWinters(tsSM3_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM3_HW070809, ylim = c(0, 25), main="Water Heater & AC Holt-Winters filtering")

Perform a new forecast on the data w/o seasonal data:

## HoltWinters forecast & plot
tsSM3_HW070809for <- forecast(tsSM3_HW070809, h=25)
plot(tsSM3_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time - Water Heather and AC", main="Water Heater & AC Forecasts from Holt-Winters")

## Forecast HoltWinters with diminished confidence levels
tsSM3_HW070809forC <- forecast(tsSM3_HW070809, h=25, level=c(10,25))
## Plot only the forecasted area
plot(tsSM3_HW070809forC, ylim = c(0, 9), ylab= "Watt-Hours", xlab="Time - Water Heather and AC", start(2010), main = "Water Heater & AC Holt-Winters forecasts")

### TODO: how is this plot different from the forecasted plot from step three of the plan of attack? Which, if any, is more useful?

Sub-meter 2 Holt-Winters Forecasting:

## Seasonal adjusting sub-meter 2 by subtracting the seasonal component & plot
tsSM2_070809Adjusted <- tsSM2_070809weekly - components070809SM2weekly$seasonal
autoplot(tsSM2_070809Adjusted, main = "Laundry Room Holt-Winters Forecasting")

## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
autoplot(decompose(tsSM2_070809Adjusted), main = "Adjusted Laundry Room decomposition of additive time series")

HoltWinters Simple Exponential Smoothing

## Holt Winters Exponential Smoothing & Plot
tsSM2_HW070809 <- HoltWinters(tsSM2_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM2_HW070809, ylim = c(0, 25), main = "Laundry Room Holt-Winters filtering")

Perform a new forecast on the data w/o seasonal data:

## HoltWinters forecast & plot
tsSM2_HW070809for <- forecast(tsSM2_HW070809, h=25)
plot(tsSM2_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Laundry Room forecasts from Holt-Winters")

## Forecast HoltWinters with diminished confidence levels
tsSM2_HW070809forC <- forecast(tsSM2_HW070809, h=25, level=c(10,25))
## Plot only the forecasted area
plot(tsSM2_HW070809forC, ylim = c(0, 5), ylab= "Watt-Hours", xlab="Time", start(2010), main = "Laundry Room Holt-Winters forecasts")

Sub-meter 1 Holt-Winters Forecasting:

## Seasonal adjusting sub-meter 1 by subtracting the seasonal component & plot
tsSM1_070809Adjusted <- tsSM1_070809weekly - components070809SM1weekly$seasonal
autoplot(tsSM1_070809Adjusted, main = "Kitchen Holt-Winters Forecasting")

## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
autoplot(decompose(tsSM1_070809Adjusted), main = "Adjusted Kitchen decomposition of additive time series ")

HoltWinters Simple Exponential Smoothing

## Holt Winters Exponential Smoothing & Plot
tsSM1_HW070809 <- HoltWinters(tsSM1_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM1_HW070809, ylim = c(0, 25), main = "Kitchen Holt-Winters filtering")

Perform a new forecast on the data w/o seasonal data:

## HoltWinters forecast & plot
tsSM1_HW070809for <- forecast(tsSM1_HW070809, h=25)
plot(tsSM1_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Kitchen Holt-Winters Forecasts")

## Forecast HoltWinters with diminished confidence levels
tsSM1_HW070809forC <- forecast(tsSM1_HW070809, h=25, level=c(10,25))
## Plot only the forecasted area
plot(tsSM1_HW070809forC, ylim = c(0, 5), ylab= "Watt-Hours", xlab="Time", start(2010), main = "Kitchen Holt-Winters Forecasts")